Run this script on vulcan, it will save a pickle dictionary containing links for all files related to each participant:
import os
import subprocess
import pickle

# Pass 1: map each participant ID (e.g. 'PGPC_0001') to the list of download
# links for their fastq.gz files on the PGP Canada public file server.
participants = {}
participant_numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 67, 69, 70, 71, 72, 73, 74, 76, 77, 78, 82, 87]
for num in participant_numbers:
    participants['PGPC_'+str(num).zfill(4)] = []

# File IDs are sequential integers; probe each one for its real filename.
for a in range(0, 3000):
    link = "https://personalgenomes.ca/v1/public/files/"+str(a)+"/download"
    try:
        # --spider fetches headers only; grep exits non-zero when the
        # Content-Disposition header is absent, raising CalledProcessError.
        output = subprocess.check_output("wget --spider --server-response "+link+" 2>&1 | grep -i content-disposition", shell=True)
    except subprocess.CalledProcessError:
        continue  # no header (or wget failed) for this file id
    output = str(output)
    try:
        fn = output.split('"')[1]  # filename="..." value from the header
    except IndexError:
        continue  # header present but not in the expected quoted form
    # Keep sequence files only; skip the md5 checksum companions.
    if 'md5sum' not in output and 'fastq.gz' in output:
        for participant in participants:
            if participant in fn:
                participants[participant].append(link)

with open('participant_links.dict', 'wb') as f:
    pickle.dump(participants, f)
This didn’t get all of them because where most are e.g. 0001, some of them are e.g. 54 or 51, so checking the file names didn’t work for all. Modifying the script to get the rest:
import os
import subprocess
import pickle

# Pass 2: some files are named with unpadded IDs (e.g. 'PGPC_54' rather than
# 'PGPC_0054'), so re-check every link not matched in pass 1 against the
# unpadded form as well.
with open('participant_links.dict', 'rb') as f:
    participants = pickle.load(f)
already_saved = []
for participant in participants:
    if participants[participant] != []:
        already_saved = already_saved+participants[participant]

all_links = {}
for a in range(0, 3000):
    link = "https://personalgenomes.ca/v1/public/files/"+str(a)+"/download"
    if link in already_saved: continue  # matched in pass 1
    try:
        output = subprocess.check_output("wget --spider --server-response "+link+" 2>&1 | grep -i content-disposition", shell=True)
    except subprocess.CalledProcessError:
        continue  # header missing or wget failed for this file id
    output = str(output)
    try:
        fn = output.split('"')[1]  # filename="..." value from the header
    except IndexError:
        continue
    if 'md5sum' not in output and 'fastq.gz' in output:
        for participant in participants:
            if participant in fn:
                participants[participant].append(link)
            # Strip only the *leading* zeros of the number: 'PGPC_0054' ->
            # 'PGPC_54'.  The previous replace('0', '') removed every zero
            # ('PGPC_0010' -> 'PGPC_1'), which over-assigned links.
            elif 'PGPC_'+participant[5:].lstrip('0') in fn:
                participants[participant].append(link)
    # Record every resolved filename so the repair pass can audit matches.
    all_links[link] = fn

with open('participant_links_2.dict', 'wb') as f:
    pickle.dump(participants, f)
with open('all_links.dict', 'wb') as f:
    pickle.dump(all_links, f)
This time I ended up adding to too many because I removed too many zeroes when checking which participant to add the link to, so I fixed it like this:
import os
import subprocess
import pickle

# Repair pass: pass 2 stripped too many zeros from participant IDs, so some
# links were assigned to the wrong participants.  Rebuild each participant's
# link list, keeping only links whose recorded filename really matches.
with open('/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/run_on_server/sort_links/participant_links_2.dict', 'rb') as f:
    participants = pickle.load(f)
# link -> filename map recorded during pass 2.
with open('/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/run_on_server/sort_links/all_links.dict', 'rb') as f:
    all_links = pickle.load(f)
new_participants = {}
for participant in participants:
    new_links = []
    for link in participants[participant]:
        if link in all_links:
            file = all_links[link]
            if '.md5' not in file and participant in file:
                # NOTE(review): exact zero-padded matches are printed for
                # inspection but NOT kept in new_links -- confirm intentional.
                print(participant, file)
            elif '.md5' not in file and 'PGPC_'+participant[7:] in file:
                # e.g. 'PGPC_0051'[7:] == '51' -> unpadded name 'PGPC_51'.
                new_links.append(link)
        else:
            # Link came from pass 1 (never re-checked, so absent from
            # all_links); it was matched on the full padded ID -- keep it.
            # NOTE(review): indentation reconstructed; this 'else' is assumed
            # to pair with 'if link in all_links' -- confirm.
            new_links.append(link)
    new_participants[participant] = new_links
with open('/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/run_on_server/sort_links/participant_links_3.dict', 'wb') as f:
    pickle.dump(new_participants, f)
I checked several at random to see that the number of files was correct and it all looks good.
import os
import subprocess
import pickle
import numpy as np
from Bio import SeqIO

# Thread count for kneaddata; kept as a string because it is spliced directly
# into a shell command below.
threads = '40'
# participant ID -> list of verified download links (from the repair pass).
with open('participant_links_3.dict', 'rb') as f:
    participants = pickle.load(f)
def print_and_log(message, log):
    """Echo *message* to stdout, append it to the running log list, return the list."""
    print(message)
    log.append(message)
    return log
def split_file(file, num_records, total, fmt="fasta"):
    """Split a sequence file into numbered parts of at most num_records records.

    file        -- path ending in '.<fmt>'
    num_records -- records per output part (may be a numpy float from np.ceil)
    total       -- total record count; kept for backward compatibility (the
                   final partial batch is now flushed explicitly instead of
                   relying on a running counter)
    fmt         -- Biopython format name, default "fasta"

    Returns the list of part filenames written ('<stem>_<n>.<fmt>').
    """
    fn = file.replace('.'+fmt, '')
    files, batch, part = [], [], 1
    for record in SeqIO.parse(file, fmt):
        batch.append(record)
        if len(batch) >= num_records:
            out = fn+'_'+str(part)+'.'+fmt
            SeqIO.write(batch, out, fmt)
            files.append(out)
            batch, part = [], part+1
    # Flush the final partial batch.  The original count/count_file bookkeeping
    # was off by one (count_file started at 1), so trailing records could be
    # collected into new_file but never written out.
    if batch:
        out = fn+'_'+str(part)+'.'+fmt
        SeqIO.write(batch, out, fmt)
        files.append(out)
    return files
def download_files(directory, links, log):
    """Download every link into *directory* and pair the files into R1/R2 sets.

    Returns (log, sorted_files) where sorted_files is a list of sorted
    [R1, R2] filename pairs; files without an R1/R2 mate are dropped.
    """
    log = print_and_log('Downloading links to directory '+directory, log)
    files = []
    for link in links:
        log = print_and_log('Getting '+link, log)
        #if we already have this file, continue to the next link
        # The server's Content-Disposition header carries the real filename.
        output = subprocess.check_output("wget --spider --server-response "+link+" 2>&1 | grep -i content-disposition", shell=True)
        output = str(output)
        fn = output.split('"')[1]
        # Normalise lane-suffixed names ('..._001.fastq.gz' -> '....fastq.gz').
        fn = fn.replace('_001.', '.')
        log = print_and_log('Checking for file '+fn, log)
        if os.path.exists(directory+'/'+fn):
            files.append(fn)
            log = print_and_log('Already have '+fn, log)
            continue
        log = print_and_log("Didn't have it, so downloading "+fn+" from "+link, log)
        command = 'wget '+link+' -O '+directory+'/'+fn
        os.system(command)
        # wget may fail silently via os.system; verify the file landed on disk.
        if not os.path.exists(directory+'/'+fn): log = print_and_log("Didn't get "+fn+" from link "+link, log)
        else:
            files.append(fn)
            log = print_and_log("Got "+fn+" from link "+link, log)
    #Sort the files into pairs of R1 and R2
    files = sorted(list(set(files)))
    sorted_files, already_added = [], []
    for file in files:
        if file in already_added: continue
        if file.replace('R1', 'R2') in files:
            sorted_files.append(sorted([file, file.replace('R1', 'R2')]))
            already_added = already_added+[file, file.replace('R1', 'R2')]
            log = print_and_log('New pair: '+file+', '+file.replace('R1', 'R2'), log)
        elif file.replace('R2', 'R1') in files:
            sorted_files.append(sorted([file, file.replace('R2', 'R1')]))
            already_added = already_added+[file, file.replace('R2', 'R1')]
            log = print_and_log('New pair: '+file+', '+file.replace('R2', 'R1'), log)
    return log, sorted_files
def check_and_split_file(directory, files, log):
    """Split any R1/R2 pair whose members exceed 10 GB into smaller parts.

    files is a list of [R1, R2] filename pairs inside *directory*.  Returns
    (log, new_files) where new_files is the updated list of pairs with any
    directory prefix stripped (bare filenames only).
    """
    log = print_and_log('Checking for files that need splitting for '+directory, log)
    need_splitting = []
    for file_set in files:
        size1, size2 = os.stat(directory+'/'+file_set[0]).st_size, os.stat(directory+'/'+file_set[1]).st_size
        if size1/1000000000 > 10 or size2/1000000000 > 10:
            # Keep the R1 size (GB) alongside the pair to derive the part count.
            need_splitting.append(file_set+[size1/1000000000])
    if need_splitting == []:
        # Nothing oversized: hand the original pairs straight back.
        log = print_and_log("Didn't need to split any files for "+directory, log)
        return log, files
    # Pairs small enough pass through unchanged.  (The original compared the
    # 2-element pairs against 3-element need_splitting entries, so nothing was
    # ever filtered and the oversized originals stayed in the returned list.)
    split_pairs = [file_set[:2] for file_set in need_splitting]
    new_files = [file_set for file_set in files if file_set not in split_pairs]
    for file_set in need_splitting:
        #unzip both files
        os.system('gunzip '+directory+'/'+file_set[0])
        os.system('gunzip '+directory+'/'+file_set[1])
        f1, f2 = directory+'/'+file_set[0].replace('.gz', ''), directory+'/'+file_set[1].replace('.gz', '')
        #check if number of records is the same in each
        count1, count2 = 0, 0
        for rec in SeqIO.parse(f1, "fastq"):
            count1 += 1
        for rec in SeqIO.parse(f2, "fastq"):
            count2 += 1
        if count1 != count2:
            log = print_and_log("Couldn't split files "+file_set[0]+" and "+file_set[1]+" because they didn't have the same number of lines. "+file_set[0]+" has "+str(count1)+" lines while "+file_set[1]+" has "+str(count2)+" lines", log)
            os.system('gzip '+f1)
            os.system('gzip '+f2)
            # Keep the (re-zipped) pair so downstream steps still process it.
            new_files.append(file_set[:2])
            continue
        # Aim for parts of <= 10 GB, splitting records evenly across parts.
        num_files = np.ceil(file_set[2]/10)
        records_per_file = np.ceil(count1/num_files)
        # NOTE(review): split_file parses with format "fasta" while these are
        # fastq files -- confirm the split actually yields records.
        first = split_file(f1, records_per_file, count1)
        log = print_and_log('Split the files '+f1+' into '+str(num_files)+' files', log)
        second = split_file(f2, records_per_file, count1)
        log = print_and_log('Split the files '+f2+' into '+str(num_files)+' files', log)
        these_files = []
        for f in first:
            # Re-pair each R1 part with its matching R2 part.
            if f.replace('R1', 'R2') in second:
                mate = f.replace('R1', 'R2')
            elif f.replace('R2', 'R1') in second:
                mate = f.replace('R2', 'R1')
            else:
                continue
            log = print_and_log('gzipping '+f, log)
            os.system('gzip '+f)
            log = print_and_log('gzipping '+mate, log)
            os.system('gzip '+mate)
            these_files.append(sorted([f+'.gz', mate+'.gz']))
        # str(...) -- concatenating a list to a string raised TypeError before.
        log = print_and_log("Successfully split all parts of "+str(file_set[:2]), log)
        new_files = new_files+these_files
    # Strip directory prefixes so split parts match the bare unsplit names.
    # (Entries are [R1, R2] pairs; the original tested "'/' in pair", a list
    # membership test that never fired.)
    for i in range(len(new_files)):
        new_files[i] = [name.split('/')[-1] for name in new_files[i]]
    return log, new_files
def run_kneaddata(directory, files, log):
    """Run kneaddata (trimming + host-read removal) on each R1/R2 pair.

    files is a list of [R1, R2] filename pairs inside *directory*; output goes
    to <directory>/kneaddata_out/.  Returns the updated log list.
    """
    for file_set in files:
        f1, f2 = directory+'/'+file_set[0], directory+'/'+file_set[1]
        log = print_and_log('Checking for previous kneaddata file', log)
        # Expected final output for this pair; skip the pair if it exists.
        # (Was '.fastq.qz' -- a typo that made the check never match, so every
        # pair was reprocessed on each run.)
        paired_out = directory+'/'+'kneaddata_out/'+file_set[0].replace('.fastq.gz', '')+'_kneaddata_paired.fastq.gz'
        if not os.path.exists(paired_out):
            log = print_and_log("Didn't have "+paired_out, log)
            log = print_and_log('Unzipping '+f1+' and '+f2, log)
            os.system('gunzip '+f1)
            os.system('gunzip '+f2)
            f1, f2 = f1.replace('.gz', ''), f2.replace('.gz', '')
            log = print_and_log('Running kneaddata with '+f1+' and '+f2, log)
            command = 'kneaddata -i '+f1+' -i '+f2+' -o '+directory+'/kneaddata_out/ -db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ -t '+threads+' --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" --bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output'
            os.system(command)
            log = print_and_log('Zipping '+f1+' and '+f2, log)
            os.system('gzip '+f1)
            os.system('gzip '+f2)
            output_list = os.listdir(directory+'/kneaddata_out/')
            # Was 'output_list == 1' (a list compared to an int, always False);
            # compare the number of output files instead.
            if len(output_list) <= 1: log = print_and_log("Something happened and kneaddata didnt run properly with "+f1+' and '+f2, log)
            else: log = print_and_log('Looks like kneaddata at least created some files', log)
            for out in output_list:
                if '.log' not in out and 'kneaddata_paired' not in out:
                    # Intermediate file: delete it from kneaddata_out (the
                    # original removed from *directory*, where it never exists).
                    os.system('rm '+directory+'/kneaddata_out/'+out)
                    log = print_and_log('Removed '+directory+'/kneaddata_out/'+out, log)
                elif '.log' in out:
                    log = print_and_log('Got a kneaddata log file ', log)
                elif 'kneaddata_paired' in out:
                    if '.gz' not in out:
                        os.system('gzip '+directory+'/kneaddata_out/'+out)
                    log = print_and_log('Got and zipped '+out, log)
    return log
def remove_fasta(directory, files, log):
    """Delete the downloaded read files from *directory* once processed.

    files may contain bare filenames or [R1, R2] pairs (as returned by
    download_files/check_and_split_file).  The original passed each pair
    straight into 'rm '+directory+'/'+file, which raised TypeError on a list;
    pairs are now flattened first.  os.system never raises on a failed rm, so
    the old try/except was dead code -- the exit status is checked instead.
    """
    log = print_and_log('Beginning to remove fasta files from '+directory, log)
    for file_set in files:
        names = file_set if isinstance(file_set, (list, tuple)) else [file_set]
        for file in names:
            status = os.system('rm '+directory+'/'+file)
            if status == 0:
                log = print_and_log('Removed '+directory+'/'+file, log)
            else:
                log = print_and_log("Didn't manage to remove "+directory+'/'+file, log)
    log = print_and_log('Removed all the files we could from '+directory, log)
    return log
def write_logfile(directory, log):
    """Write each log entry as one line of <directory>/log.txt (overwrites)."""
    path = directory+'/log.txt'
    with open(path, 'w') as handle:
        handle.writelines(entry+'\n' for entry in log)
    return
# Main driver: for each participant, download the reads, split oversized
# files, run kneaddata, remove the raw files, and write a per-participant log.
for participant in participants:
    log = []
    # Debug filter: currently only processes PGPC_0002 -- remove to run all.
    if participant != 'PGPC_0002': continue
    if os.path.exists(participant):
        # Directory exists: only rerun if no kneaddata_paired output is there.
        have_knead = False
        if os.path.exists(participant+'/kneaddata_out/'):
            for file in os.listdir(participant+'/kneaddata_out/'):
                if 'kneaddata_paired' in file:
                    have_knead = True
                    break
        if not have_knead:
            log = print_and_log('Already had the directory for '+participant+', but no kneaddata_paired file so continuing', log)
        else:
            log = print_and_log('Already have at least one kneaddata_paired file for '+participant+', so moving onto the next participant', log)
            write_logfile(participant, log)
            # Was 'break', which stopped the whole run after the first finished
            # participant; 'continue' matches the log message above.
            continue
    else:
        os.system('mkdir '+participant)
        log = print_and_log('Made the directory for '+participant, log)
    links = list(set(participants[participant]))
    try:
        #Download all files to the new directory, returning the log file and a list of the files sorted into R1 and R2 pairs
        log = print_and_log('Starting the download function for '+participant, log)
        log, sorted_files = download_files(participant, links, log)
        log = print_and_log('Finished the download function for '+participant, log)
    except Exception:
        # Deliberately broad: any failure is logged and we move on.
        log = print_and_log('Something happened with the download function for '+participant, log)
        write_logfile(participant, log)
        continue
    try:
        #Split the files to smaller files if needed
        log, sorted_files = check_and_split_file(participant, sorted_files, log)
        log = print_and_log('Finished the split function for '+participant, log)
    except Exception:
        log = print_and_log('Something happened with the split function for '+participant, log)
        write_logfile(participant, log)
        continue
    try:
        #Run kneaddata on each file set
        log = run_kneaddata(participant, sorted_files, log)
        log = print_and_log('Finished the kneaddata function for '+participant, log)
    except Exception:
        log = print_and_log('Something happened with the kneaddata function for '+participant, log)
        write_logfile(participant, log)
        continue
    try:
        #Remove the fasta files
        remove_fasta(participant, sorted_files, log)
    except Exception:
        log = print_and_log('Something happened with the remove fasta function for '+participant, log)
        write_logfile(participant, log)
        continue
    #Write log file
    write_logfile(participant, log)
Keep getting kneaddata/trimmomatic/java errors. Tried running kneaddata directly and still get an error:
kneaddata -i PGPC_0002_S1_L001_R1.fastq -i PGPC_0002_S1_L001_R2.fastq -o kneaddata_out/ -db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ -t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" --bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output
Reformatting file sequence identifiers ...
Reformatting file sequence identifiers ...
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersr9ltMI_PGPC_0002_S1_L001_R1 ): 71760200
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersrAyq4P_PGPC_0002_S1_L001_R2 ): 71760200
Running Trimmomatic ...
CRITICAL ERROR: Error executing: java -Xmx500m -jar /home/robyn/tools/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 20 -phred33 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersr9ltMI_PGPC_0002_S1_L001_R1 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersrAyq4P_PGPC_0002_S1_L001_R2 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.single.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.2.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.single.2.fastq SLIDINGWINDOW:4:20 MINLEN:50
Error message returned from Trimmomatic :
java: error while loading shared libraries: libjli.so: cannot open shared object file: No such file or directory
Trying with parallel like usual:
parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
::: PGPC_0002_S1_L001_R1.fastq ::: PGPC_0002_S1_L001_R2.fastq
Also now not working? Trying on a file I know works:
parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
::: PGPC_0001_S2_L001_R1_001.fastq.gz ::: PGPC_0001_S2_L001_R2_001.fastq.gz
Decompressing gzipped file ...
Reformatting file sequence identifiers ...
Reformatting file sequence identifiers ...
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifiersKJxCEj_decompressed_u3uQqc_PGPC_0001_S2_L001_R2_001 ): 58716973
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifierskJy0nE_decompressed_JCmV4s_PGPC_0001_S2_L001_R1_001 ): 58716973
Running Trimmomatic ...
CRITICAL ERROR: Error executing: java -Xmx500m -jar /home/robyn/tools/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 20 -phred33 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifierskJy0nE_decompressed_JCmV4s_PGPC_0001_S2_L001_R1_001 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifiersKJxCEj_decompressed_u3uQqc_PGPC_0001_S2_L001_R2_001 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.single.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.2.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.single.2.fastq SLIDINGWINDOW:4:20 MINLEN:50
Error message returned from Trimmomatic :
java: error while loading shared libraries: libjli.so: cannot open shared object file: No such file or directory
local:0/1/100%/763.0s
Still having the java issue.
Removed Trimmomatic and reinstalled it with conda — kneaddata doesn't find it that way, so I downloaded the binary from the website again. Installed bowtie2 with conda. Installed kneaddata with conda.
Trying again:
parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
::: PGPC_0001_S2_L001_R1_001.fastq.gz ::: PGPC_0001_S2_L001_R2_001.fastq.gz
Making a new conda environment and seeing if that works:
conda create --name kneaddata
conda activate kneaddata
conda install -c bioconda kneaddata
Made a new environment with a fresh install and that worked fine. Kneaddata should be installed with conda and not pip (this also installs all dependencies). Participants 1-15 were run on vulcan while the others were run on the Compute Canada beluga and graham servers (details in separate R notebook).
Join reads:
import os

# Rename each participant's kneaddata paired outputs from '..._paired_1/_2' to
# '..._paired_R1/_R2', then concatenate each pair into a single joined file.
base_folder = 'kneaddata_all_participants'
participants = os.listdir(base_folder)
for participant in participants:
    if 'PGPC' not in participant: continue
    # PGPC_0001 was already processed separately.
    if 'PGPC_0001' in participant: continue
    knead_folder = base_folder+'/'+participant+'/kneaddata_out/'
    knead_out = os.listdir(knead_folder)
    # One entry per sample/lane prefix (the text before '_kneaddata').
    unique = []
    for file in knead_out:
        if '.fastq.gz' not in file: continue
        if file.split('_kneaddata')[0] not in unique:
            unique.append(file.split('_kneaddata')[0])
    for file in unique:
        r1_rename, r2_rename = file+'_kneaddata_paired_1.fastq.gz', file+'_kneaddata_paired_2.fastq.gz'
        r1, r2 = file+'_kneaddata_paired_R1.fastq.gz', file+'_kneaddata_paired_R2.fastq.gz'
        # concat_paired_end.pl needs the R1/R2 naming to recognise pairs.
        os.system('mv '+knead_folder+r1_rename+' '+knead_folder+r1)
        os.system('mv '+knead_folder+r2_rename+' '+knead_folder+r2)
        os.system('concat_paired_end.pl -p 4 -o '+base_folder+'/joined_reads/ '+knead_folder+file+'_kneaddata_paired_*.fastq.gz -f')
One file didn’t join:
concat_paired_end.pl -p 4 -o joined_reads/ PGPC_0087/kneaddata_out/*_kneaddata_paired_*.fastq.gz -f
Some files don’t have the same name format (missing zeroes before the participant number), fix that now:
import os

# Zero-pad the participant number in joined-read filenames so every file
# follows the 'PGPC_00NN' convention (some were written as e.g. 'PGPC_54').
unpadded_ids = ['PGPC_'+str(n) for n in [51, 53, 54, 55, 56, 57, 59, 61, 62, 67, 69, 70, 71, 72, 73, 74, 76, 77, 78, 82, 87]]
for fn in os.listdir('joined_reads/'):
    for prefix in unpadded_ids:
        if prefix in fn:
            padded = fn.replace('PGPC_', 'PGPC_00')
            os.system('mv joined_reads/'+fn+' joined_reads/'+padded)
Join lanes:
import os

# Copy single-lane participants straight into joined_lanes/ (multi-lane ones
# were handled with concat_lanes.pl, see the commented command).
# range(2, 88): the original range(2, 87) stopped at 86 and silently skipped
# PGPC_0087, which exists (see the PGPC_0087 join above).
participants = ['PGPC_'+str(participant).zfill(4) for participant in range(2, 88)]
all_files = os.listdir('joined_reads/')
for participant in participants:
    #os.system('concat_lanes.pl joined_reads/'+participant+'* -o joined_lanes/ -p 4')
    this_participant = [file for file in all_files if participant in file]
    if len(this_participant) == 1:
        os.system('cp joined_reads/'+this_participant[0]+' joined_lanes/')
Now rename files:
import os

# Collapse joined-lane filenames down to 'PGPC_<number>.fastq.gz'.
for old_name in sorted(os.listdir('joined_lanes/')):
    participant_number = old_name.split('_')[1]
    os.system('mv joined_lanes/'+old_name+' joined_lanes/'+'PGPC_'+participant_number+'.fastq.gz')
Looks like it wouldn’t be easy to filter out reads based on confidence afterwards so running all confidence parameters now.
parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 12 --db /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ --memory-mapping {1} --output kraken2_outraw/{1/.}_gtdb_{2}.kraken.txt --report kraken2_kreport/{1/.}_gtdb_{2}.kreport --confidence {2}) 2> times/time_{1/.}_gtdb_{2}.txt' ::: joined_lanes/*.fastq ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00
parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 12 --db /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 --memory-mapping {1} --output kraken2_outraw/{1/.}_refseq_{2}.kraken.txt --report kraken2_kreport/{1/.}_refseq_{2}.kreport --confidence {2}) 2> times/time_{1/.}_refseq_{2}.txt' ::: joined_lanes/*.fastq ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00
Test on single file: Convert fastq to fasta:
sed -n '1~4s/^@/>/p;2~4p' joined_lanes/PGPC_0001.fastq > joined_lanes/PGPC_0001.fasta
Run dustmasker:
dustmasker -in joined_lanes/PGPC_0001.fasta -outfmt fasta -out joined_lanes/PGPC_0001_masked.fasta
Run with all files: Convert fastq to fasta:
import os

# Convert every joined fastq to fasta (headers '@...' become '>...').
fastqs = [f for f in os.listdir('joined_lanes/') if '.fastq' in f]
for fq in fastqs:
    os.system("sed -n '1~4s/^@/>/p;2~4p' joined_lanes/"+fq+" > joined_lanes/"+fq.replace('fastq', 'fasta'))

# Dustmask low-complexity regions in each fasta (PGPC_0001 was already done).
fastas = sorted([f for f in os.listdir('joined_lanes/') if '.fasta' in f and 'masked' not in f])
for fa in fastas:
    if 'PGPC_0001' in fa: continue
    os.system('dustmasker -in joined_lanes/'+fa+' -outfmt fasta -out joined_lanes/'+fa.replace('.fasta', '_masked.fasta'))
This leaves the masked reads in the fasta file but in lower rather than upper case, so they need to be removed:
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
def get_upper(string):
    """Return only the uppercase characters of *string*, preserving order."""
    return ''.join(filter(str.isupper, string))
# Dustmasker lower-cases masked bases; drop them (and any fully-masked reads)
# by keeping only the uppercase characters of each record.
files = os.listdir('joined_lanes/')
files = sorted([f for f in files if 'masked' in f and 'removed' not in f])
for file in files:
    kept = []
    for record in SeqIO.parse('joined_lanes/'+file, "fasta"):
        unmasked = get_upper(str(record.seq))
        if unmasked:
            kept.append(SeqRecord(Seq(unmasked), id=record.id, description=record.description))
    SeqIO.write(kept, 'joined_lanes/'+file.replace('.fasta', '_removed.fasta'), "fasta")
With the first file (PGPC_0001), this has file sizes of 1.3G > 742M > 749M > 425M for the .fastq > .fasta > _masked.fasta > _masked_removed.fasta
parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 24 --db /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ --memory-mapping {1} --output kraken2_outraw/{1/.}_gtdb_{2}.kraken.txt --report kraken2_kreport/{1/.}_gtdb_{2}.kreport --confidence {2}) 2> times/time_{1/.}_gtdb_{2}.txt' ::: joined_lanes/*_masked_removed.fasta ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00
parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 24 --db /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 --memory-mapping {1} --output kraken2_outraw/{1/.}_refseq_{2}.kraken.txt --report kraken2_kreport/{1/.}_refseq_{2}.kreport --confidence {2}) 2> times/time_{1/.}_refseq_{2}.txt' ::: joined_lanes/*_masked_removed.fasta ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00
parallel -j 12 'bracken -d /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ -i {} -l S -o {.}.bracken -r 150' ::: kraken2_kreport/*_gtdb_*.kreport
parallel -j 12 'bracken -d /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 -i {} -l S -o {.}.bracken -r 150' ::: kraken2_kreport/*_refseq_*.kreport
Get species and taxid/accession dictionaries:
# Local analysis: collect species names and taxid/accession lookups from the
# bracken outputs and bracken-adjusted kreports.
folder_name = '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/analysis/kraken2_kreport/'
results_folder = os.listdir(folder_name)
# Bracken abundance tables and bracken-adjusted kreports, respectively.
bracken = [result for result in results_folder if result[-8:] == '.bracken']
kreport = [result for result in results_folder if result[-15:] == 'bracken.kreport']
def kreport_gtdb(sp_dict, sp_dom_dict, gtdb_accession):
    """Populate the species lookups from the custom GTDB database sample sheet.

    sp_dict        -- species name -> ';'-joined lineage (rank prefixes stripped)
    sp_dom_dict    -- species name -> domain (first lineage field)
    gtdb_accession -- species name -> genome accession (the sheet's index)
    Returns the three (mutated) dicts.
    """
    taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/GTDB_human_db/RW_db_samples.tsv', header=0, index_col=0, sep='\t')
    all_tax = set(taxa.loc[:, 'gtdb_taxonomy'].values)
    count = 0
    for tax in all_tax:
        # Strip the 'd__'/'p__'/... prefixes from each rank name.
        tax = tax.split(';')
        new_tax = ''
        for level in tax:
            level = level.split('__')[1]
            new_tax += level+';'
        new_tax = new_tax[:-1]
        # NOTE(review): after prefix-stripping ';s__' never occurs, so the
        # split is a no-op and the stored value is the *full* lineage
        # including the species name -- confirm this is intended.
        sp_dict[new_tax.split(';')[-1]] = new_tax.split(';s__')[0]
    for sp in sp_dict:
        sp_dom_dict[sp] = sp_dict[sp].split(';')[0]
    for row in taxa.index.values:
        acc = row
        count += 1
        row = taxa.loc[row, :].values
        # Assumes the first column holds the gtdb_taxonomy string -- confirm
        # against the sheet layout.
        species = row[0].split('s__')[1]
        gtdb_accession[species] = acc
    return sp_dict, sp_dom_dict, gtdb_accession
def kreport_refseq(result, sp_dict, sp_dom_dict, refseq_taxid):
    """Parse one RefSeq kreport and extend the species/taxonomy lookups.

    result       -- kreport filename inside the global folder_name
    sp_dict      -- species name -> ';'-joined d__..g__ lineage
    sp_dom_dict  -- species name -> domain
    refseq_taxid -- species name -> NCBI taxid
    Returns the three (mutated) dicts.

    The parameter was previously misspelled 'refeq_taxid'; the body then read
    and wrote the *global* refseq_taxid and only worked because the caller
    happened to pass that same global in.
    """
    keep_levels = ['D', 'P', 'C', 'O', 'F', 'G', 'S']
    # NOTE(review): kreports have no header line, so the first data row becomes
    # the column names -- hence renames like '100.00' and 'root'.  Confirm that
    # losing that first row is acceptable.
    result = pd.read_csv(folder_name+result, sep='\t')
    result = result.rename(columns={'100.00':'perc', list(result.columns)[1]:'N', '0':'N', 'R':'level', '1':'taxid', 'root':'classification'})
    result = pd.DataFrame(result.loc[:, ['level', 'classification', 'taxid']])
    # Walk the report top-to-bottom, tracking the latest name seen at each rank.
    current = {}
    for lvl in keep_levels: current[lvl] = lvl
    for i in result.index.values:
        line = result.loc[i, :].values
        line[1] = line[1].lstrip()  # kreport names are indented by rank
        if line[0] not in keep_levels: continue
        current[line[0]] = line[1]
        if line[0] == 'S':
            if line[1] not in refseq_taxid: refseq_taxid[line[1]] = line[2]
            if line[1] in sp_dict: continue
            # Build a GTDB-style lineage string from the tracked ranks.
            tax = ''
            for level in current:
                if level == 'S': continue
                if level != 'D': tax += ';'
                tax += level.lower()+'__'+current[level]
            sp_dict[line[1]] = tax
            sp_dom_dict[line[1]] = tax.split(';')[0]
    return sp_dict, sp_dom_dict, refseq_taxid
# Build the lookups: GTDB taxonomy from the database sample sheet, RefSeq
# taxonomy from one kreport per sample, then pickle everything for reuse.
sp_dict, sp_dom_dict, refseq_taxid, gtdb_accession = {}, {}, {}, {}
sp_dict, sp_dom_dict, gtdb_accession = kreport_gtdb(sp_dict, sp_dom_dict, gtdb_accession)
for file in kreport:
    if 'refseq' in file:
        # Taxonomy is identical across confidence cutoffs, so parsing the
        # confidence-0.00 kreport per sample is sufficient.
        if '0.00' not in file: continue
        sp_dict, sp_dom_dict, refseq_taxid = kreport_refseq(file, sp_dict, sp_dom_dict, refseq_taxid)
analysis = '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/analysis/'
with open(analysis+'gtdb_accession.dict', 'wb') as f:
    pickle.dump(gtdb_accession, f)
with open(analysis+'refseq_taxid.dict', 'wb') as f:
    pickle.dump(refseq_taxid, f)
with open(analysis+'sp_dict.dict', 'wb') as f:
    pickle.dump(sp_dict, f)
with open(analysis+'sp_dom_dict.dict', 'wb') as f:
    pickle.dump(sp_dom_dict, f)
Run on server:
# Server-side aggregation: combine every per-sample bracken table into one
# species x sample read-count matrix, plus a domain-level roll-up.
folder_name = 'kraken2_kreport/'
results_folder = os.listdir(folder_name)
bracken = [result for result in results_folder if result[-8:] == '.bracken']
participants, groups, confidence = [], [], []
reads = []
count = 0
for sample in bracken:
    count += 1
    result = pd.read_csv(folder_name+sample, sep='\t', header=0, index_col=0)
    result = result.loc[:, ['new_est_reads']]
    if 'gtdb' in sample:
        # GTDB rows carry rank prefixes ('s__Name'); strip to bare names so
        # they line up with the RefSeq rows.
        rename_dict = {}
        for row in result.index.values:
            rename_dict[row] = row.split('__')[1]
        result = result.rename(index=rename_dict)
    # Filename pattern: <part1>_<part2>_..._<db/mask tags>_<confidence>.bracken
    ss = sample.split('_')
    part = ss[0]+'-'+ss[1]
    conf = ss[-1]
    if count % 100 == 0: print(count)  # progress indicator
    if 'masked' in sample and 'refseq' in sample: db = 'refseq-masked'
    elif 'refseq' in sample: db = 'refseq'
    elif 'masked' in sample and 'gtdb' in sample: db = 'gtdb-masked'
    else: db = 'gtdb'
    sample_name = part+'_'+db+'_'+conf
    if part not in participants: participants.append(part)
    if db not in groups: groups.append(db)
    if conf not in confidence: confidence.append(conf)
    result = result.rename(columns={'new_est_reads':sample_name})
    reads.append(result)
# One column per sample; duplicate species rows are summed together.
reads = pd.concat(reads).fillna(value=0)
reads = reads.groupby(by=reads.index, axis=0).sum()
cols = sorted(list(reads.columns.values))
reads = reads.loc[:, cols]
# Roll species up to domains via the pickled species -> domain lookup.
with open('sp_dom_dict.dict', 'rb') as f: sp_dom_dict = pickle.load(f)
domain_reads = reads.rename(index=sp_dom_dict)
domain_reads = domain_reads.groupby(by=domain_reads.index, axis=0).sum()
# Save the participant/db/confidence label lists alongside the matrices.
pgc = [participants, groups, confidence]
with open('pgc.list', 'wb') as f: pickle.dump(pgc, f)
reads.to_csv('reads.csv')
domain_reads.to_csv('domain_reads.csv')
# Back on the local machine: reload everything the server run produced.
# NOTE(review): 'analysis' is assumed to be defined in an earlier cell (the
# same path used when the dictionaries were first pickled) -- confirm.
with open(analysis+'gtdb_accession.dict', 'rb') as f: gtdb_accession = pickle.load(f)
with open(analysis+'refseq_taxid.dict', 'rb') as f: refseq_taxid = pickle.load(f)
with open(analysis+'sp_dict.dict', 'rb') as f: sp_dict = pickle.load(f)
with open(analysis+'sp_dom_dict.dict', 'rb') as f: sp_dom_dict = pickle.load(f)
with open(analysis+'pgc.list', 'rb') as f: pgc = pickle.load(f)
participants, groups, confidence = pgc[0], pgc[1], pgc[2]
reads = pd.read_csv(analysis+'reads.csv', index_col=0, header=0)
domain_reads = pd.read_csv(analysis+'domain_reads.csv', index_col=0, header=0)
reads_bacteria = pd.read_csv(analysis+'reads_bacteria.csv', index_col=0, header=0)
#
# if 'd__Bacteria' in domain_reads.index.values:
# rename_dict = {'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea', 'd__Eukaryota':'Eukaryota', 'Animalia':'Eukaryota', 'd__Viruses':'Viruses'}
# domain_reads = domain_reads.rename(index=rename_dict)
# domain_reads = domain_reads.groupby(by=domain_reads.index, axis=0).sum()
# domain_reads.to_csv(analysis+'domain_reads.csv')
#
# if '.bracken' in list(domain_reads.columns)[0]:
# rename_dict = {}
# for sample in domain_reads.columns:
# rename_dict[sample] = sample.replace('.bracken', '')
# domain_reads = domain_reads.rename(columns=rename_dict)
# reads = reads.rename(columns=rename_dict)
# domain_reads.to_csv(analysis+'domain_reads.csv')
# reads.to_csv(analysis+'reads.csv')
#
# confidence = [conf.replace('.bracken', '') for conf in confidence]
# pgc[2] = confidence
# with open(analysis+'pgc.list', 'wb') as f: pickle.dump(pgc, f)
def get_domain(reads_df, domain):
    """Subset *reads_df* to species whose domain (via sp_dom_dict) is in *domain*."""
    keep = [sp for sp in reads_df.index.values if sp_dom_dict[sp] in domain]
    return reads_df.loc[keep, :]
#reads_bacteria = get_domain(reads, ['Bacteria', 'd__Bacteria'])
#reads_bacteria.to_csv(analysis+'reads_bacteria.csv')
Check taxa overlap and get an idea of how many taxa we might need genomes for:
# Split the per-sample columns by which reference database produced them,
# then compare how many taxa each database recovers, how many are shared,
# and how many survive read-count / relative-abundance cut-offs.
gtdb = [s for s in reads.columns if 'gtdb' in s]
refseq = [s for s in reads.columns if 'gtdb' not in s and 'refseq' in s]
reads_gtdb = reads.loc[:, gtdb]
reads_gtdb = reads_gtdb[reads_gtdb.max(axis=1) > 0]
reads_gtdb_red = reads_gtdb[reads_gtdb.max(axis=1) > 20]
reads_refseq = reads.loc[:, refseq]
reads_refseq = reads_refseq[reads_refseq.max(axis=1) > 0]
reads_refseq_red = reads_refseq[reads_refseq.max(axis=1) > 20]
tax_refseq = list(reads_refseq.index.values)
tax_gtdb = list(reads_gtdb.index.values)
# Taxa detected by both databases (RefSeq ordering preserved).
both = [tax for tax in tax_refseq if tax in tax_gtdb]
print(len(tax_refseq), len(tax_gtdb), len(both))
print(reads_gtdb_red.shape[0], reads_refseq_red.shape[0])
# Repeat the cut using per-sample relative abundance (%) instead of counts.
reads_gtdb_perc = reads_gtdb.divide(reads_gtdb.sum(axis=0), axis=1).multiply(100)
reads_gtdb_red = reads_gtdb_perc[reads_gtdb_perc.max(axis=1) > 1]
reads_refseq_perc = reads_refseq.divide(reads_refseq.sum(axis=0), axis=1).multiply(100)
reads_refseq_red = reads_refseq_perc[reads_refseq_perc.max(axis=1) > 1]
print(reads_gtdb_red.shape[0], reads_refseq_red.shape[0])
The number of taxa classified using RefSeq is 14,494 and using GTDB is 29,226, with 5,254 species shared between the two. If I remove taxa whose maximum read count across samples is below 10, then 12,334 remain in GTDB and 5,411 in RefSeq. Raising this threshold to 20 leaves 8,760 in GTDB and 3,714 in RefSeq. If we use relative abundance instead and remove all taxa below 0.1%, 456 remain in GTDB and 299 in RefSeq; at a 1% cut-off, 128 remain in GTDB and 96 in RefSeq. (Note that this is based on all reads, including other domains, and I imagine it will be different when looking only at the bacterial reads.)
# Boxplots of per-sample Bacteria vs Eukaryota read counts: one panel per
# database group, one pair of boxes per confidence threshold.
fig = plt.figure(figsize=(15, 10))
axes = [plt.subplot(2, 2, p) for p in range(1, 5)]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d, db in enumerate(groups):
    x, data, xloc = [], [], []
    count = 1
    for conf in confidence:
        # Column names follow <participant>_<database>_<confidence>.
        keeping = [s for s in domain_reads.columns if s.split('_')[1] == db and s.split('_')[2] == conf]
        domain_red = domain_reads.loc[:, keeping]
        conf_bac = list(domain_red.loc['Bacteria', :].values)
        conf_human = list(domain_red.loc['Eukaryota'].values)
        x.extend([count, count + 1])
        xloc.append(count + 0.5)
        data.extend([conf_bac, conf_human])
        count += 2
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    # Alternate the two colours across the Bacteria/Eukaryota box pairs.
    for patch, color in zip(bplot['boxes'], colors * 21):
        patch.set_facecolor(color)
    axes[d].set_ylim([0, 8000000])
    if d in (1, 3):
        axes[d].set_yticks([])
    else:
        axes[d].set_ylabel('Number of reads')
    if d < 2:
        axes[d].set_xticks([])
    else:
        axes[d].set_xticks(xloc)
        axes[d].set_xticklabels(confidence, rotation=90)
    axes[d].set_title(groups[d])
handles = [Patch(facecolor=colors[0], edgecolor='k', label='Bacteria'), Patch(facecolor=colors[1], edgecolor='k', label='Eukaryota')]
axes[1].legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05, 1.05))
fig.tight_layout()
plt.show()
# Boxplots of per-sample Bacteria read counts only: one panel per database
# group, one box per confidence threshold.
fig = plt.figure(figsize=(15, 10))
axes = [plt.subplot(2, 2, p) for p in range(1, 5)]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d, db in enumerate(groups):
    x, data, xloc = [], [], []
    count = 1
    for conf in confidence:
        # Column names follow <participant>_<database>_<confidence>.
        keeping = [s for s in domain_reads.columns if s.split('_')[1] == db and s.split('_')[2] == conf]
        domain_red = domain_reads.loc[:, keeping]
        conf_bac = list(domain_red.loc['Bacteria', :].values)
        x.append(count)
        xloc.append(count)
        data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]] * 21):
        patch.set_facecolor(color)
    axes[d].set_ylim([0, 4500000])
    if d in (1, 3):
        axes[d].set_yticks([])
    else:
        axes[d].set_ylabel('Number of reads')
    if d < 2:
        axes[d].set_xticks([])
    else:
        axes[d].set_xticks(xloc)
        axes[d].set_xticklabels(confidence, rotation=90)
    axes[d].set_title(groups[d])
fig.tight_layout()
plt.show()
# Boxplots of the percentage of each sample's classified reads assigned to
# Bacteria: one panel per database group, one box per confidence threshold.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d in range(len(groups)):
    db, x, data, xloc, count = groups[d], [], [], [], 1
    for conf in confidence:
        # Column names follow <participant>_<database>_<confidence>.
        keeping = []
        for sample in domain_reads.columns:
            ss = sample.split('_')
            if ss[1] == db and ss[2] == conf: keeping.append(sample)
        domain_red = domain_reads.loc[:, keeping]
        # Convert raw counts to per-sample percentages before plotting.
        domain_red = domain_red.divide(domain_red.sum(axis=0), axis=1).multiply(100)
        conf_bac = list(domain_red.loc['Bacteria', :].values)
        x.append(count), xloc.append(count), data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]]*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    plt.ylim([0, 100])
    if d == 1 or d == 3: plt.yticks([])
    # Fix: this figure shows percentages (values 0-100), not raw read
    # counts, so the axis label must say so.
    else: plt.ylabel('Percentage of reads')
    if d < 2: plt.xticks([])
    else: plt.xticks(xloc, confidence, rotation=90)
    plt.title(groups[d])
plt.tight_layout()
plt.show()
# Convert bacterial read counts to per-sample relative abundance (%) and
# report how many species survive each of two abundance cut-offs.
col_totals = reads_bacteria.sum(axis=0)
reads_bacteria_perc = reads_bacteria.divide(col_totals, axis=1).multiply(100)
for cutoff in (0.1, 1):
    reads_bacteria_red = reads_bacteria_perc[reads_bacteria_perc.max(axis=1) > cutoff]
    print(reads_bacteria_red.shape[0])
If we keep bacteria with > 0.1% abundance, we have 1,208 species left; if we keep bacteria with > 1% abundance, we have 384 species left.
# Boxplots of per-sample species richness (number of bacterial species with
# at least one read): one panel per database group, one box per confidence.
fig = plt.figure(figsize=(15, 10))
axes = [plt.subplot(2, 2, p) for p in range(1, 5)]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d, db in enumerate(groups):
    x, data, xloc = [], [], []
    count = 1
    for conf in confidence:
        # Column names follow <participant>_<database>_<confidence>.
        keeping = [s for s in reads_bacteria.columns if s.split('_')[1] == db and s.split('_')[2] == conf]
        domain_red = reads_bacteria.loc[:, keeping]
        # Binarise to presence/absence, then count species per sample.
        domain_red[domain_red > 0] = 1
        conf_bac = list(domain_red.sum(axis=0).values)
        x.append(count)
        xloc.append(count)
        data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]] * 21):
        patch.set_facecolor(color)
    if d in (1, 3):
        axes[d].set_yticks([])
    else:
        axes[d].set_ylabel('Number of species')
    # Top row (GTDB) spans far more species than the bottom row, so the two
    # rows get different y limits.
    if d < 2:
        axes[d].set_xticks([])
        axes[d].set_ylim([0, 6500])
    else:
        axes[d].set_xticks(xloc)
        axes[d].set_xticklabels(confidence, rotation=90)
        axes[d].set_ylim([0, 1200])
    axes[d].set_title(groups[d])
fig.tight_layout()
plt.show()
# Same species-richness boxplots as above, but with a log-scale y axis so
# all four panels share a comparable view.
fig = plt.figure(figsize=(15, 10))
axes = [plt.subplot(2, 2, p) for p in range(1, 5)]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d, db in enumerate(groups):
    x, data, xloc = [], [], []
    count = 1
    for conf in confidence:
        # Column names follow <participant>_<database>_<confidence>.
        keeping = [s for s in reads_bacteria.columns if s.split('_')[1] == db and s.split('_')[2] == conf]
        domain_red = reads_bacteria.loc[:, keeping]
        # Binarise to presence/absence, then count species per sample.
        domain_red[domain_red > 0] = 1
        conf_bac = list(domain_red.sum(axis=0).values)
        x.append(count)
        xloc.append(count)
        data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]] * 21):
        patch.set_facecolor(color)
    axes[d].set_yscale('log')
    if d in (0, 2):
        axes[d].set_ylabel('Number of species')
    if d < 2:
        axes[d].set_xticks([])
    else:
        axes[d].set_xticks(xloc)
        axes[d].set_xticklabels(confidence, rotation=90)
    axes[d].set_title(groups[d])
fig.tight_layout()
plt.show()
Bray-Curtis of bacteria only at species level
# NMDS ordination (Bray-Curtis) of the bacterial species table. The
# embedding is cached to disk so the slow ordination only runs once.
npos_file = analysis + 'npos_bacteria.df'
if not os.path.exists(npos_file):
    pos, npos, stress = transform_for_NMDS(reads_bacteria.transpose())
    with open(npos_file, 'wb') as f:
        pickle.dump(npos, f)
else:
    with open(npos_file, 'rb') as f:
        npos = pickle.load(f)
# Scatter the species-level NMDS embedding six ways: coloured by database,
# coloured by confidence, and one panel per database with the other
# databases' points faded out.
fig = plt.figure(figsize=(15, 15))
panels = [plt.subplot(3, 2, p) for p in range(1, 7)]
ax1, ax2, ax3, ax4, ax5, ax6 = panels
colormap = mpl.cm.get_cmap('winter', 256)
norm = mpl.colors.Normalize(vmin=0, vmax=21)
m1 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
colors = [m1.to_rgba(a) for a in range(21)]
colors_conf = {conf: colors[c] for c, conf in enumerate(confidence)}
colors_db = {'gtdb':'#1A5276', 'gtdb-masked':'#5499C7', 'refseq':'#E67E22', 'refseq-masked':'#F7DC6F'}
db_panels = list(zip([ax3, ax4, ax5, ax6], ['gtdb', 'gtdb-masked', 'refseq', 'refseq-masked']))
for a, column in enumerate(reads_bacteria.columns):
    sample = column.split('_')
    ax1.scatter(npos[a, 0], npos[a, 1], color=colors_db[sample[1]], alpha=0.6, edgecolor='k')
    ax2.scatter(npos[a, 0], npos[a, 1], color=colors_conf[sample[2]], alpha=0.6, edgecolor='k')
    for panel, db in db_panels:
        # Full opacity for this panel's own database, faded for the rest.
        panel.scatter(npos[a, 0], npos[a, 1], color=colors_conf[sample[2]], alpha=1 if sample[1] == db else 0.1, edgecolor='k')
ax = [ax1, ax2, ax3, ax4, ax5, ax6]
titles = ['Coloured by database', 'Coloured by confidence', 'GTDB coloured by confidence', 'GTDB-masked coloured by confidence', 'RefSeq coloured by confidence', 'RefSeq-masked coloured by confidence']
for panel, title in zip(panels, titles):
    panel.set_title(title)
    panel.set_xlabel('NMDS1')
    panel.set_ylabel('NMDS2')
handles = [Patch(facecolor=colors_db[db], edgecolor='k', label=db) for db in colors_db]
ax1.legend(handles=handles, loc='best')
handles = [Patch(facecolor=colors_conf[conf], edgecolor='k', label=conf) for conf in confidence]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05, 1.0))
fig.tight_layout()
plt.show()
Bray-Curtis of bacteria only at genus level
# Collapse the bacterial species table to genus level and NMDS-ordinate it;
# both the genus table and the embedding are cached so this only runs once.
if os.path.exists(analysis+'npos_bacteria_genus.df'):
    with open(analysis+'npos_bacteria_genus.df', 'rb') as f: npos = pickle.load(f)
    reads_bacteria_genus = pd.read_csv(analysis+'reads_bacteria_genus.csv', header=0, index_col=0)
else:
    # Map each species to its genus: second-to-last field of the ';'-
    # separated lineage string, with any '__' rank prefix and surrounding
    # whitespace removed. (The unused `count` counter was removed.)
    gen_dict = {}
    for sp in reads_bacteria.index.values:
        gen = sp_dict[sp].split(';')[-2]
        if '__' in gen:
            gen = gen.split('__')[1]
        gen_dict[sp] = gen.strip()
    reads_bacteria_genus = reads_bacteria.rename(index=gen_dict)
    # Sum reads for species that share a genus.
    # NOTE(review): groupby(..., axis=0) is deprecated in newer pandas.
    reads_bacteria_genus = reads_bacteria_genus.groupby(by=reads_bacteria_genus.index, axis=0).sum()
    pos, npos, stress = transform_for_NMDS(reads_bacteria_genus.transpose())
    with open(analysis+'npos_bacteria_genus.df', 'wb') as f: pickle.dump(npos, f)
    reads_bacteria_genus.to_csv(analysis+'reads_bacteria_genus.csv')
# Scatter the genus-level NMDS embedding six ways: coloured by database,
# coloured by confidence, and one panel per database with the other
# databases' points faded out.
fig = plt.figure(figsize=(15, 15))
panels = [plt.subplot(3, 2, p) for p in range(1, 7)]
ax1, ax2, ax3, ax4, ax5, ax6 = panels
colormap = mpl.cm.get_cmap('winter', 256)
norm = mpl.colors.Normalize(vmin=0, vmax=21)
m1 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
colors = [m1.to_rgba(a) for a in range(21)]
colors_conf = {conf: colors[c] for c, conf in enumerate(confidence)}
colors_db = {'gtdb':'#1A5276', 'gtdb-masked':'#5499C7', 'refseq':'#E67E22', 'refseq-masked':'#F7DC6F'}
db_panels = list(zip([ax3, ax4, ax5, ax6], ['gtdb', 'gtdb-masked', 'refseq', 'refseq-masked']))
for a, column in enumerate(reads_bacteria_genus.columns):
    sample = column.split('_')
    ax1.scatter(npos[a, 0], npos[a, 1], color=colors_db[sample[1]], alpha=0.6, edgecolor='k')
    ax2.scatter(npos[a, 0], npos[a, 1], color=colors_conf[sample[2]], alpha=0.6, edgecolor='k')
    for panel, db in db_panels:
        # Full opacity for this panel's own database, faded for the rest.
        panel.scatter(npos[a, 0], npos[a, 1], color=colors_conf[sample[2]], alpha=1 if sample[1] == db else 0.1, edgecolor='k')
ax = [ax1, ax2, ax3, ax4, ax5, ax6]
titles = ['Coloured by database', 'Coloured by confidence', 'GTDB coloured by confidence', 'GTDB-masked coloured by confidence', 'RefSeq coloured by confidence', 'RefSeq-masked coloured by confidence']
for panel, title in zip(panels, titles):
    panel.set_title(title)
    panel.set_xlabel('NMDS1')
    panel.set_ylabel('NMDS2')
handles = [Patch(facecolor=colors_db[db], edgecolor='k', label=db) for db in colors_db]
ax1.legend(handles=handles, loc='best')
handles = [Patch(facecolor=colors_conf[conf], edgecolor='k', label=conf) for conf in confidence]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05, 1.0))
fig.tight_layout()
plt.show()